# author:    Yannick Stiller, A. Duer, Robert A. Huber
# contact:   robert.huber@sbg.ac.at
# file name: data.R
# Context:   Load and process the original data  

#### Load packages and create functions ####

library(tidyverse)
'%!in%' <- function(x,y)!('%in%'(x,y))
pew_names <- function(x, start){
  car::recode(substr(names(x), start, nchar(names(x))), 
              "'BAN' = 'BGD'; 'BRI' = 'GBR'; 'CHI' = 'CHN'; 'ESA' = 'SLV'; 'GER' = 'DEU';
              'GRE' = 'GRC'; 'INDIA' = 'IND'; 'INDO' = 'IDN'; 'LEB' = 'LBN'; c('MAL','MALA') = 'MYS';
              'NIG' = 'NGA'; 'PAL' = 'PSE'; 'PHI' = 'PHL'; 'RSA' = 'ZAF'; 'SKOR' = 'KOR';
              'SPA' = 'ESP'; 'TAN' = 'TZA'; 'THAI' = 'THA'; 'US' = 'USA'; 'VIE' = 'VNM'")
}

#### Data loading and cleaning ####

oD_lab <- foreign::read.spss("./third_party/Pew Research Global Attitudes Spring 2014 Dataset for Web.sav", use.value.labels = T, to.data.frame = T)
oD <- foreign::read.spss("./third_party/Pew Research Global Attitudes Spring 2014 Dataset for Web.sav", use.value.labels = F, to.data.frame = T)

df <- data.frame(id = 1:nrow(oD),
                 country = oD_lab$COUNTRY,
                 weight = oD$weight)

df$country_short <- countrycode::countrycode(df$country, origin = "country.name", destination = "iso3c")

df$country_year <- paste0(df$country_short, "_2012") # use two years prior to survey for regional data

#### Simple Variables ####

# Q13a Please tell me whether you completely agree, mostly agree, mostly disagree or completely disagree with the following statements: 
# a. Most people are better off in a free market economy, even though some people are rich and some are poor.
df$economic_left_right <- as.numeric(factor(oD_lab$Q13A, levels = c("Completely disagree", "Mostly disagree", "Mostly agree", "Completely agree")))

#Q133. How old were you at your last birthday?
df$age_num <- ifelse(oD$Q133 >97, NA, oD$Q133) # 98 = don't know; 99 = refused
df$age <- factor(car::recode(df$age_num, "18:40=0;41:65=1;66:97=2"),
                 levels = c(0:2),
                 labels = c("young", "middle", "old"))

#Q132. Gender
df$female <- oD_lab$Q132

#Q140. Which of the following employment situations best describes your current status?
df$employment <- ifelse(oD_lab$Q140 %in% c("Don't know", "Refused"), NA, as.character(oD_lab$Q140))
df$employment <- factor(ifelse(df$employment %in% c("In paid work", "Apprentice or trainee"), "Employed", "Unemployed"),
                        levels = c("Unemployed", "Employed"))

#### Trade and Distributional Consequences ####

#Q27. What do you think about the growing trade and business ties between (survey country) and other countries - do you think it is a very good thing, somewhat good, somewhat bad or a very bad thing for our country?
df$trade_support <- factor(ifelse(oD$Q27 > 4, NA, as.character(oD_lab$Q27)),
                           levels = c("Very bad", "Somewhat bad", "Somewhat good", "Very good"))

#Q28. Does trade with other countries lead to an increase in the wages of (survey nationality) workers, a decrease in wages, or does it not make a difference?
df$consequences_wages <- factor(ifelse(oD$Q28 > 3, NA, as.character(oD_lab$Q28)),
                                levels = c("Decrease", "Does not make a difference", "Increase"))

#Q29. Does trade with other countries lead to job creation in (survey country), job losses, or does it not make a difference?
df$consequences_jobs <- factor(ifelse(oD$Q29 > 3, NA, as.character(oD_lab$Q29)),
                               levels = c("Job losses", "Does not make a difference", "Job creation"))

#### Region of respondent ####
# No single variable available
# Combined regions of residency with information about whether the respondent lives in a urban or rural place so that we could use more fine-grained subnational trade competitiveness data

#Q176:185: Urban-rural
#Q175: Regions; "STRATUM" Variables consist of string combining region name and urban-rural information
STRATUM <- oD_lab[,which(substr(names(oD_lab),1,7) == "STRATUM")]
names(STRATUM) <- pew_names(x = STRATUM, start = 9)
STRATUM$id <- df$id
# Needs lots of cleaning
STRATUM$ARG <- ifelse(oD_lab$COUNTRY != "Argentina", NA, 
                      paste(stringr::str_trim(gsub(as.character(STRATUM$ARG),
                                                   pattern = " \\d.*", replacement = "")),
                            oD_lab$Q182ARG))
STRATUM$BRA <- ifelse(oD_lab$COUNTRY != "Brazil", NA,
                      paste(as.character(oD_lab$Q175BRA), oD_lab$Q181BRA))
STRATUM$CHL <- stringr::str_trim(as.character(STRATUM$CHL))
STRATUM$CHN <- ifelse(oD_lab$COUNTRY != "China", NA,
                      paste0(as.character(oD_lab$Q175BCHI), 
                             ifelse(oD_lab$Q182CHI == " Rural location  ", " - rural", " - urban")))
STRATUM$COL <- stringr::str_trim(as.character(STRATUM$COL))
STRATUM$DEU <- ifelse(oD_lab$COUNTRY != "Germany", NA, 
                      ifelse(oD_lab$Q179GER %in% c("Refused", "Don't know"), "NA",
                             paste0(mgsub::mgsub(as.character(oD_lab$STRATUM_GER),
                                                 pattern = c("\\-U.*", "\\-\\d.*"), replacement = c("", "")),
                                    ifelse(oD_lab$Q179GER == "Rural   ", " - rural", " - urban"))))
STRATUM$EGY <- stringr::str_trim(as.character(STRATUM$EGY))
STRATUM$ESP <- stringr::str_trim(mgsub::mgsub(as.character(STRATUM$ESP),
                                              pattern = c(" \\- Less.*", " \\- 2.000.*", " \\- 5.000.*", " \\- 10.000.*", " \\- 20.000.*", " \\- 50.000.*", " \\- 100.000.*", " \\- 500.000.*"),
                                              replacement = c("_rural", "_rural", "_rural", "_urban", "_urban", "_urban", "_urban", "_urban")))
STRATUM$FRA <- stringr::str_trim(ifelse(grepl(STRATUM$FRA, pattern = "Rural"),
                                        as.character(STRATUM$FRA),
                                        mgsub::mgsub(as.character(STRATUM$FRA),
                                                     pattern = c("\\-.*", " U.*"), replacement = c("-Urban", "-Urban"))))
STRATUM$GBR <- ifelse(oD_lab$COUNTRY != "United Kingdom", NA,
                      ifelse(oD_lab$Q176BRI %in% c("Don't know","Refused"), "NA", 
                             ifelse(is.na(oD_lab$STRATUM_BRI), "NA", 
                                    paste0(as.character(oD_lab$STRATUM_BRI),
                                           car::recode(oD_lab$Q176BRI, "'The countryside' = '_rural'; c('A city', 'A large town', 'A small town') = '_urban'")))))
STRATUM$GHA <- stringr::str_trim(mgsub::mgsub(as.character(STRATUM$GHA),
                                              pattern = c(" Towns/rural.*", " Towns with.*", " Cities.*", "  Cities.*"), replacement = c("_rural", "_urban", "_urban", "_urban")))
STRATUM$GRC <- stringr::str_trim(ifelse(STRATUM$GRC %in% c("Central Macedonia-Great Urban", "Thessaly-Great Urban"),
                                        gsub(as.character(STRATUM$GRC), pattern = "\\-G.*", replacement = "-urban"),
                                        mgsub::mgsub(as.character(STRATUM$GRC),
                                                     pattern = c("\\-G.*", "\\- G.*", "\\-S.*", " \\-S.*", " \\-U.*", " \\-R.*"), replacement = c("", "", "-urban", "-urban", "-urban", "-rural"))))
STRATUM$IDN <- stringr::str_trim(as.character(STRATUM$IDN))
STRATUM$IND <- mgsub::mgsub(as.character(STRATUM$IND),
                            pattern = c("\\-C.*", "\\-R.*", "\\,.*"), replacement = c("_urban", "_rural", ""))
STRATUM$ISR <- mgsub::mgsub(as.character(STRATUM$ISR),
                            pattern = c("\\, Rural.*", "\\, Urban.*", "\\, Metropolitan.*"), replacement = c("_rural", "_urban", "_urban"))
STRATUM$ITA <- stringr::str_trim(mgsub::mgsub(as.character(STRATUM$ITA),
                                              pattern = c(" \\d\\, 2.*", "\\, 2.*", "\\- up.*", "\\- \\d.*"), replacement = c("", "", "_rural", "_urban")))
STRATUM$JOR <- stringr::str_trim(as.character(STRATUM$JOR))
STRATUM$JPN <- mgsub::mgsub(as.character(STRATUM$JPN),
                            pattern = c("\\-Settlements with more.*", "\\-Settlements with fewer.*"), replacement = c("_urban", "_rural"))
STRATUM$KEN <- ifelse(oD_lab$COUNTRY != "Kenya", NA,
                      paste0(stringr::str_trim(oD_lab$Q175KEN), 
                             ifelse(oD_lab$Q181KEN == "Urban", " - urban", " - rural")))
STRATUM$KOR <- ifelse(oD_lab$COUNTRY != "South Korea", NA,
                      paste0(stringr::str_trim(oD_lab$Q175SKOR), 
                             car::recode(oD_lab$Q178SKOR, "'Rural location' = '_rural'; 'Small/medium city' = '_urban'; 'Metropolitan city' = ''")))
STRATUM$LBN <- stringr::str_trim(as.character(STRATUM$LBN))
STRATUM$MEX <- mgsub::mgsub(as.character(STRATUM$MEX), 
                            pattern = c(" R.*", " U.*"), replacement = c("_rural", "_urban"))
STRATUM$MYS <- stringr::str_trim(as.character(STRATUM$MYS))
STRATUM$NGA <- stringr::str_trim(mgsub::mgsub(as.character(STRATUM$NGA),
                                              pattern = c(" \\- Urban", " \\- Semi-urban", " \\- Rural"), replacement = c("_urban", "_rural", "_rural")))
STRATUM$PAK <- mgsub::mgsub(as.character(STRATUM$PAK),
                            pattern = c(" \\- Urban.*", " \\- Rural.*"), replacement = c("_urban", "_rural"))
STRATUM$PER <- stringr::str_trim(as.character(STRATUM$PER))
STRATUM$PHL <- stringr::str_trim(mgsub::mgsub(as.character(STRATUM$PHL),
                                              pattern = c(" \\- Urban", " \\-Urban", "\\- Rural", " \\- Rural"), replacement = c("_urban", "_urban", "_rural", "_rural")))
STRATUM$POL <- ifelse(oD_lab$COUNTRY != "Poland", NA,
                      paste0(stringr::str_trim(oD_lab$Q175POL), 
                             ifelse(oD_lab$Q180POL == "Urban", " - urban", " - rural")))
STRATUM$PSE <- ifelse(oD_lab$COUNTRY != "Palestinian territories", NA,
                      paste0(stringr::str_trim(oD_lab$Q175PAL), 
                             car::recode(oD_lab$Q181PAL, "'Urban (cities and towns)' = '_urban'; 'Rural (villages)' = '_rural'; 'Refugee camps' = '_refugee'")))
STRATUM$RUS <- ifelse(oD_lab$COUNTRY != "Russia", NA,
                      paste(oD_lab$Q175RUS, oD_lab$Q183RUS))
STRATUM$SEN <- stringr::str_trim(as.character(STRATUM$SEN))
STRATUM$SLV <- stringr::str_trim(as.character(STRATUM$SLV))
STRATUM$TUN <- stringr::str_trim(as.character(STRATUM$TUN))
STRATUM$TUR <- ifelse(oD_lab$COUNTRY != "Turkey", NA,
                      paste(oD_lab$Q175ATUR, oD_lab$Q181TUR))
STRATUM$UGA <- stringr::str_trim(as.character(STRATUM$UGA))
STRATUM$UKR <- ifelse(oD_lab$COUNTRY != "Ukraine", NA, 
                      paste(oD_lab$STRATUM_UKR, oD_lab$Q181UKR))
STRATUM$USA <- ifelse(oD_lab$COUNTRY != "United States", NA,
                      paste(oD_lab$Q175US, 
                            ifelse(oD_lab$USR == "Rural", 
                                   "Rural", "Urban"))) # US has no STRATUM variable but it has its own urban-rural variable, which is split into rural, suburban and urban. Adding suburban to urban fits best to urbanisation data.
STRATUM$VEN <- ifelse(oD_lab$COUNTRY != "Venezuela", NA,
                      paste0(stringr::str_trim(oD_lab$Q175VEN), 
                             ifelse(oD_lab$Q179VEN == "Urban", " - urban", " - rural")))
STRATUM$ZAF <- stringr::str_trim(mgsub::mgsub(as.character(STRATUM$ZAF),
                                              pattern = c(" \\- Urban-Urban", " \\- Non-Urban", " \\- Urban-Metro"), replacement = c("_urban", "_rural", "")))
# Combine into one single variable
STRATUM <- STRATUM %>%
  gather(-"id", key = "country_short", value = "region_name") %>%
  drop_na() %>%
  dplyr::left_join(.,readxl::read_excel("./original_data/region_matching.xlsx", sheet = "region", na = "NA"), by = c("region_name", "country_short"))

df <- dplyr::left_join(df, STRATUM[,c("id", "region_name", "region_code")], by = "id")


#### Education ####
# Each country has a different coding scheme
edu <- readxl::read_xlsx("./original_data/education_recoding.xlsx", na = "NA", trim_ws = F)

#Q138: What is the highest level of school you have attended?
Q138 <- oD_lab[,which(substr(names(oD_lab),1,4) == "Q138")]
names(Q138) <- pew_names(x = Q138, start = 5)
Q138A <- Q138[,which(names(Q138) %in% df$country_short)]
Q138A$id <- df$id
Q138A <- Q138A %>%
  gather(-"id", key = "country_short", value = "edu_raw") %>%
  drop_na() %>%
  dplyr::left_join(., edu, by = c("country_short", "edu_raw")) %>%
  mutate(edu3 = factor(ifelse(edu3 == "Ongoing", NA,
                              ifelse(is.na(edu3), NA, edu3))),
         edu2 = factor(ifelse(edu3 == "Tertiary", "Tertiary",
                              ifelse(is.na(edu3), NA, "Non-Tertiary"))))

df <- dplyr::left_join(df, Q138A[,c("id", "edu2", "edu3")], by = c("id"))


#### Economic Competitiveness ####
# Calculated accoring to Huber et al. (2022)
comp <- readRDS("./Original_data/comp.rds")

df <- dplyr::left_join(df, comp %>%
                         mutate(country_year = paste0(country_short, "_", year)) %>%
                         dplyr::select(country_short, country_year, district, competitiveness = srtc_lrca_trad_r), 
                       by = c("region_code" = "district", "country_short", "country_year"))

#### Country Development Level ####
# Current US Dollar

df <- wbstats::wb_data(country = unique(df$country_short), indicator = "NY.GDP.PCAP.CD",
                  start_date = 2012, end_date = 2012,
                  return_wide = T) %>%
  mutate(date = 2012,
         country_year = paste0(iso3c, "_", date)) %>%
  dplyr::rename('GDPpc' = NY.GDP.PCAP.CD) %>%
  dplyr::select(one_of("country_year", "GDPpc")) %>%
  dplyr::left_join(df, ., "country_year") %>%
  mutate(GDPpc = GDPpc / 10^3)

#### Subnational GNI per capita ####

shdi <- read.csv("./third_party/SHDI Complete 4.0 (1).csv") %>%
  filter(iso_code %in% df$country_short,
         year == 2012)

shdi_com <- dplyr::left_join(readxl::read_xlsx("./original_data/region_matching.xlsx", sheet = "shdi", trim_ws = F), 
                             shdi, 
                             by = c("shdi_code" = "GDLCODE", "shdi_name" = "region")) %>%
  dplyr::select(-c("region_code", "country", "year", "level", "continent")) %>%
  group_by(iso_code, region_name) %>%
  summarize(gnic = questionr::wtd.mean(x = gnic, weights = pop,  na.rm = T)) %>%
  mutate(lgnic = log(gnic))
# We weight by population because several administrative units may constitute one region in the PEW classification (e.g. for the USA, only West, North, South, and East are available).

df <- dplyr::left_join(df, shdi_com, by = c(c("country_short" = "iso_code"), "region_name"))

#### Save file ####

saveRDS(df, file = "./created_data/df.rds")
